This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
library(ggplot2)
library(plyr)
library(readr)
library(dplyr)
library(glmnet)
library(tibble)
library(caret)
library(RColorBrewer)
library(pheatmap)
library(dslabs)
library(forcats)
library(reshape2)
library(tidyr)
day = read.csv("/Users/Murphy1/Desktop/DATA2020/Project2020/data/hour.csv", header = TRUE)
head(day)
day2 <- head(day, 8646)
plot <- ggplot(data = day2, aes(x = day2$dteday, y = day2$cnt)) +
geom_point(color = "brown") +
xlab('date') +ylab('count')
plot + theme(axis.ticks.x=element_blank(),
axis.text.x=element_blank()) +
ggtitle('check seasonaility')
NA
#select variables season, workingday, weathersit, temp, atemp, hum, windspeed
day1 <- dplyr::select(day, -c(instant, dteday))
head(day1)
plot <- ggplot(data = day1, aes(x = season, y = cnt, fill = factor(day1$season)))
plot + geom_boxplot() + scale_fill_brewer(palette = "YlOrBr") +
ylab('count') +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
ggtitle('count vs. seasonality')
plot <- ggplot(day1, aes(x = day1$cnt)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "wheat") +
geom_density(color = "brown") + xlab('count')
plot + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black")) +
ggtitle('distribution of count')
day1 <- lapply(day1, function(x) as.numeric(x))
day1 <- data.frame(day1)
y <- day1$cnt
X <- dplyr::select(day1, -c(cnt))
corr <- round(cor(X),2)
get_upper_tri<-function(corr){
corr[lower.tri(corr, diag=TRUE)] <- NA
return(corr)
}
upper <- get_upper_tri(corr)
melted_upper <- melt(upper, na.rm=TRUE)
ggplot(data = melted_upper, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "wheat", high = "brown", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab") +
theme(
axis.text.x = element_text(angle = 45, vjust = 1,
size = 4, hjust = 1),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank())+
geom_text(aes(Var2, Var1, label = value), color = "gray", size = 2.8) +
coord_fixed() +
labs(fill='correlation') + ggtitle('correlation heatmap')
#drop atemp
day1 <- dplyr::select(day1, -c(atemp))
day1
# Check Line
fit.all = lm(cnt ~ season + yr + mnth + hr + holiday + workingday + weathersit + temp + hum + windspeed, data = day1)
intercept_only <- lm(cnt ~ 1, data=day1)
forward <- step(intercept_only, direction='forward',
scope=formula(fit.all))
Start: AIC=180764.7
cnt ~ 1
Df Sum of Sq RSS AIC
+ temp 1 93677759 478083832 177657
+ hr 1 88790198 482971393 177834
+ hum 1 59618351 512143240 178853
+ yr 1 35876722 535884870 179641
+ season 1 18127040 553634551 180207
+ weathersit 1 11598301 560163290 180411
+ mnth 1 8321115 563440476 180512
+ windspeed 1 4970060 566791531 180615
+ holiday 1 546889 571214702 180750
+ workingday 1 524387 571237204 180751
<none> 571761591 180765
Step: AIC=177657
cnt ~ temp
Df Sum of Sq RSS AIC
+ hr 1 66728221 411355610 175046
+ hum 1 49874585 428209247 175744
+ yr 1 31342263 446741569 176481
+ windspeed 1 6021342 472062489 177439
+ weathersit 1 5880680 472203151 177444
+ season 1 1696802 476387030 177597
+ mnth 1 906463 477177369 177626
+ holiday 1 225697 477858135 177651
<none> 478083832 177657
+ workingday 1 35467 478048365 177658
Step: AIC=175046.4
cnt ~ temp + hr
Df Sum of Sq RSS AIC
+ yr 1 32229070 379126540 173631
+ hum 1 25434149 385921461 173939
+ weathersit 1 5638993 405716617 174809
+ season 1 2995571 408360039 174921
+ windspeed 1 1712372 409643238 174976
+ mnth 1 1525503 409830107 174984
+ holiday 1 260174 411095436 175037
+ workingday 1 54016 411301595 175046
<none> 411355610 175046
Step: AIC=173630.5
cnt ~ temp + hr + yr
Df Sum of Sq RSS AIC
+ hum 1 20865128 358261412 172649
+ weathersit 1 5240157 373886383 173391
+ season 1 3515639 375610901 173471
+ mnth 1 1811568 377314972 173549
+ windspeed 1 1810495 377316045 173549
+ holiday 1 307712 378818828 173618
+ workingday 1 66617 379059923 173629
<none> 379126540 173631
Step: AIC=172648.7
cnt ~ temp + hr + yr + hum
Df Sum of Sq RSS AIC
+ season 1 7325038 350936374 172292
+ mnth 1 4840141 353421271 172414
+ holiday 1 367007 357894405 172633
+ weathersit 1 133898 358127514 172644
+ workingday 1 117604 358143808 172645
<none> 358261412 172649
+ windspeed 1 15517 358245895 172650
Step: AIC=172291.7
cnt ~ temp + hr + yr + hum + season
Df Sum of Sq RSS AIC
+ holiday 1 371167 350565206 172275
+ windspeed 1 165365 350771008 172286
+ workingday 1 131868 350804506 172287
<none> 350936374 172292
+ weathersit 1 37972 350898402 172292
+ mnth 1 799 350935575 172294
Step: AIC=172275.3
cnt ~ temp + hr + yr + hum + season + holiday
Df Sum of Sq RSS AIC
+ windspeed 1 165343 350399863 172269
+ workingday 1 47102 350518104 172275
+ weathersit 1 42022 350523184 172275
<none> 350565206 172275
+ mnth 1 0 350565206 172277
Step: AIC=172269.1
cnt ~ temp + hr + yr + hum + season + holiday + windspeed
Df Sum of Sq RSS AIC
+ weathersit 1 73738 350326125 172267
+ workingday 1 48197 350351667 172269
<none> 350399863 172269
+ mnth 1 1 350399863 172271
Step: AIC=172267.5
cnt ~ temp + hr + yr + hum + season + holiday + windspeed + weathersit
Df Sum of Sq RSS AIC
+ workingday 1 53900 350272225 172267
<none> 350326125 172267
+ mnth 1 1 350326124 172269
Step: AIC=172266.8
cnt ~ temp + hr + yr + hum + season + holiday + windspeed + weathersit +
workingday
Df Sum of Sq RSS AIC
<none> 350272225 172267
+ mnth 1 1.92 350272223 172269
summary(forward)
Call:
lm(formula = cnt ~ temp + hr + yr + hum + season + holiday +
windspeed + weathersit + workingday, data = day1)
Residuals:
Min 1Q Median 3Q Max
-366.86 -93.52 -27.97 60.40 644.46
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -9.4775 6.5712 -1.442 0.149246
temp 282.8600 6.0073 47.086 < 2e-16 ***
hr 7.6962 0.1649 46.680 < 2e-16 ***
yr 80.9848 2.1668 37.376 < 2e-16 ***
hum -197.1206 6.8663 -28.708 < 2e-16 ***
season 20.0853 1.0486 19.154 < 2e-16 ***
holiday -25.1349 6.6610 -3.773 0.000162 ***
windspeed 29.5888 9.4015 3.147 0.001651 **
weathersit -3.7797 1.9044 -1.985 0.047187 *
workingday 3.9204 2.3980 1.635 0.102096
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 142 on 17369 degrees of freedom
Multiple R-squared: 0.3874, Adjusted R-squared: 0.3871
F-statistic: 1220 on 9 and 17369 DF, p-value: < 2.2e-16
#forward
backward <- step(fit.all, direction='backward')
Start: AIC=172268.8
cnt ~ season + yr + mnth + hr + holiday + workingday + weathersit +
temp + hum + windspeed
Df Sum of Sq RSS AIC
- mnth 1 2 350272225 172267
<none> 350272223 172269
- workingday 1 53901 350326124 172269
- weathersit 1 79432 350351655 172271
- windspeed 1 199754 350471977 172277
- holiday 1 286716 350558939 172281
- season 1 2453586 352725809 172388
- hum 1 16562243 366834466 173070
- yr 1 28170183 378442406 173611
- hr 1 43896453 394168676 174319
- temp 1 44245313 394517536 174334
Step: AIC=172266.8
cnt ~ season + yr + hr + holiday + workingday + weathersit +
temp + hum + windspeed
Df Sum of Sq RSS AIC
<none> 350272225 172267
- workingday 1 53900 350326125 172267
- weathersit 1 79442 350351667 172269
- windspeed 1 199752 350471977 172275
- holiday 1 287145 350559370 172279
- season 1 7398820 357671045 172628
- hum 1 16620569 366892794 173070
- yr 1 28171966 378444191 173609
- hr 1 43943958 394216183 174319
- temp 1 44711469 394983694 174353
summary(backward)
Call:
lm(formula = cnt ~ season + yr + hr + holiday + workingday +
weathersit + temp + hum + windspeed, data = day1)
Residuals:
Min 1Q Median 3Q Max
-366.86 -93.52 -27.97 60.40 644.46
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -9.4775 6.5712 -1.442 0.149246
season 20.0853 1.0486 19.154 < 2e-16 ***
yr 80.9848 2.1668 37.376 < 2e-16 ***
hr 7.6962 0.1649 46.680 < 2e-16 ***
holiday -25.1349 6.6610 -3.773 0.000162 ***
workingday 3.9204 2.3980 1.635 0.102096
weathersit -3.7797 1.9044 -1.985 0.047187 *
temp 282.8600 6.0073 47.086 < 2e-16 ***
hum -197.1206 6.8663 -28.708 < 2e-16 ***
windspeed 29.5888 9.4015 3.147 0.001651 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 142 on 17369 degrees of freedom
Multiple R-squared: 0.3874, Adjusted R-squared: 0.3871
F-statistic: 1220 on 9 and 17369 DF, p-value: < 2.2e-16
day2 <- day1 %>% dplyr::select(-c(casual, registered))
#day2
day2$cnt <- log(day2$cnt)
#day2
x <- model.matrix(day2$cnt~., day2)[,-1]
y <- day2 %>% dplyr::select(cnt) %>% unlist() %>% as.numeric()
#y
grid <- 10^seq(2, -3, by=-.1)
train <- day2 %>% sample_frac(0.8)
test <- day2 %>% setdiff(train)
x_train <- model.matrix(train$cnt~., train)[,-1]
x_test <- model.matrix(test$cnt~., test)[,-1]
cnt <- day2$cnt
#cnt
y_train <- train %>% dplyr::select(cnt) %>% unlist() %>% as.numeric()
y_test <- test %>% dplyr::select(cnt) %>% unlist() %>% as.numeric()
set.seed(1)
#on the full model
out <- glmnet(x, y, alpha = 1, lambda = grid)
#cv to choose best lambda
cv.out <- cv.glmnet(x_train, y_train, alpha = 1)
bestlam <- cv.out$lambda.min
cat('Best lambda obtained is: ', bestlam)
Best lambda obtained is: 0.002401653
#lasso with best lambda
lasso <- glmnet(x, y, alpha = 1, lambda = bestlam)
predictions <- predict(lasso, type = "coefficients", s = bestlam)
#coef(lasso, s=bestlam)
#remove intercept
lasso.mod <- glmnet(x,y, alpha =1)
beta <- coef(lasso.mod)
beta <- as.matrix(beta)
dim(beta)
[1] 12 64
lambda <- lasso.mod$lambda
length(lambda)
[1] 64
lam <- rep(lambda, 11)
length(lam)
[1] 704
beta_t <- t(beta)
beta_vec <- as.vector(beta_t)[-c(1:64)]
length(beta_vec)
[1] 704
v <- rownames(beta)[-1]
length(v)
[1] 11
names <- rep(v, each=64)
length(beta_vec)
[1] 704
length(names)
[1] 704
df <- data.frame(lam, names, beta_vec)
plot <- ggplot(df, aes(x = lam, y = beta_vec, color = names)) +
geom_line() +
labs(x = "log(lambda)", y = "Coefficient") +
theme_bw() +
scale_x_continuous(trans='log10')
plot + ggtitle('Lasso coefficients vs. lambda')
#x_test
lasso_pred <- predict(lasso, newx = x_test, s = bestlam)
#y_test
#lasso_pred
mae <- mean(abs(y_test - lasso_pred))
mse <- mean((y_test - lasso_pred)^2)
r_squared <- 1 - (sum((y_test- lasso_pred)^2) / sum((y_test - mean(y_test))^2))
mae
[1] 0.8257472
mse
[1] 1.101616
r_squared
[1] 0.4764416
day1$cnt <- log(day1$cnt)
mod_forward <- lm(formula = cnt ~ temp + hr + yr + hum + season + holiday +
windspeed + weathersit + workingday, data = day1)
plot(mod_forward, which = c(1,2,3,4))
mod_backward <- lm(formula = cnt ~ season + yr + hr + holiday + workingday +
weathersit + temp + hum + windspeed, data = day1)
plot(mod_backward, which = c(1,2,3,4))
# Check the variance
library(car)
ncvTest(fit.all)
Non-constant Variance Score Test
Variance formula: ~ fitted.values
Chisquare = 2063.008, Df = 1, p = < 2.22e-16
set.seed(42)
train_index <- createDataPartition(day1$cnt, p = 0.8, list = FALSE)
train_set <- day1[train_index,]
test_set <- day1[-train_index,]
# Train the multiple linear regression model
model <- lm(formula = cnt ~ season + yr + hr + holiday + workingday +
weathersit + temp + hum + windspeed, data = train_set)
# Make predictions on the test set
predictions <- predict(model, newdata = test_set)
# Evaluate the model
mae <- mean(abs(test_set$cnt - predictions))
mse <- mean((test_set$cnt - predictions)^2)
r_squared <- 1 - (sum((test_set$cnt - predictions)^2) / sum((test_set$cnt - mean(test_set$cnt))^2))
cat('Mean Absolute Error:', mae, '\n')
Mean Absolute Error: 0.8430607
cat('Mean Squared Error:', mse, '\n')
Mean Squared Error: 1.156954
cat('R-squared:', r_squared, '\n')
R-squared: 0.4713828
rmse <- function(actual, predicted) {
return(sqrt(mean((actual - predicted)^2)))
}
rmse_value <- rmse(test_set$cnt, predictions)
cat('Root Mean Squared Error:', rmse_value, '\n')
Root Mean Squared Error: 1.075618
plot(cnt ~ temp + hr + yr + hum + season + holiday +
windspeed + weathersit + workingday, data = day1)
day2 <- day1[ , -which(names(day1) == "cnt")]
pca <- prcomp(day2,scale=TRUE)
variance <- summary(pca)$importance[2,]
df <- data.frame(Component = 1:length(variance), Variance = variance)
df_top_10 <- df[1:10,]
df_top_10 <- df_top_10[order(-df_top_10$Variance),]
df_top_10
pc <- pca$rotation[, 1: 10]
z <- melt(pc)
colnames(z) <- c("Variable", "PC", "value")
plot <- ggplot(z, aes(x = PC, y = Variable, fill= value)) +
geom_tile(color = 'white')+
scale_fill_gradient2(low = 'brown', high = 'wheat', mid = 'ivory',
midpoint = 0, limit = c(-1,1), space = 'Lab')
plot
pca_data <- as.data.frame(predict(pca))
df2 <- pca_data[, 1:2]
df1 <- subset(day1, select = cnt)
combined_df <- cbind(df1, df2)
train_index <- createDataPartition(combined_df$cnt, p = 0.8, list = FALSE)
train_set <- combined_df[train_index,]
test_set <- combined_df[-train_index,]
# Train the multiple linear regression model
model2 <- lm(cnt ~ ., data = train_set)
summary(model2)
Call:
lm(formula = cnt ~ ., data = train_set)
Residuals:
Min 1Q Median 3Q Max
-3.9418 -0.4606 0.1181 0.6162 2.7952
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.537284 0.007966 569.55 <2e-16 ***
PC1 0.720840 0.005032 143.24 <2e-16 ***
PC2 0.113545 0.005553 20.45 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9394 on 13901 degrees of freedom
Multiple R-squared: 0.601, Adjusted R-squared: 0.601
F-statistic: 1.047e+04 on 2 and 13901 DF, p-value: < 2.2e-16
# Make predictions on the test set
predictions <- predict(model2, newdata = test_set)
# Evaluate the model
mae <- mean(abs(test_set$cnt - predictions))
mse <- mean((test_set$cnt - predictions)^2)
r_squared <- 1 - (sum((test_set$cnt - predictions)^2) / sum((test_set$cnt - mean(test_set$cnt))^2))
summary(predictions)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.916 3.689 4.405 4.555 5.292 8.573
r_squared
[1] 0.590504
mae
[1] 0.7150052
mse
[1] 0.8998416
ggplot(df_top_10, aes(x = factor(Component), y = Variance)) +
geom_bar(stat = "identity", fill = "wheat") +
scale_x_discrete(labels = 1:10) +
theme(panel.background = element_rect(fill = "white")) +
labs(x = "Principal Component", y = "Proportion of Variance Explained",
title = "Proportion of Variance Explained by Top 10 Principal Components")+
theme(legend.position = "none")